# Load libraries
library(readr)
library(dplyr)
library(tidyverse)
library(Seurat) |> suppressWarnings()
library(patchwork)
library(ggplot2)
library(sctransform)
library(matrixStats)
library(purrr)
library(BayesTools)
library(VGAM) |> suppressWarnings()
library(knitr) |> suppressWarnings()
library(Matrix)
library(fastglmpca)
library(cowplot)
library(gridExtra)
library(ggpubr)
library(motifcluster)
library(cluster)
library(mclust)
library(tightClust)
library(devtools)
library(biomaRt) |> suppressWarnings()
library(presto)
library(writexl)
# library(SingleR)
set.seed(2024)STAT M254 Final Project
BoneMarrow and Pancreas Data Analysis
1 Introduction
In this project, I will analyze two datasets: BoneMarrow and Pancreas. The BoneMarrow dataset contains 2 samples, and the Pancreas dataset contains 3 samples. I will perform the following analysis:
- Setup the Seurat Object
- Standard pre-processing workfow
- Normalizing the data
- Identification of highly variable features (feature selection)
- Scaling the data
- Perform linear dimensional reduction
- Determine the ‘dimensionality’ of the dataset
- Cluster the cells
- Run non-linear dimensionalreduction (UMAP/tSNE)
- Finding differentially expressedfeatures (cluster biomarkers)
- Assigning cell type identity toclusters
2 Part 1: BoneMarrow
2.1 Read the data
# Load the BoneMarrow dataset
BoneMarrow_data1 <- readRDS(
paste("/Users/jacenai/Desktop/STAT_M254/",
"Final_Project/Datasets_final/",
"BoneMarrow_dataset1.rds",
sep = "")) |>
as.data.frame()
BoneMarrow_data2 <- readRDS(
paste("/Users/jacenai/Desktop/STAT_M254/",
"Final_Project/Datasets_final/",
"BoneMarrow_dataset2.rds",
sep = "")) |>
as.data.frame()2.2 Create Seurat and QC
# Create the BoneMarrow1 Seurat object
BoneMarrow1 <- Seurat::CreateSeuratObject(
counts = BoneMarrow_data1,
project = "BoneMarrow1",
assay = "DNA",
min.cells = 10,
min.features = 200) |>
suppressWarnings()
# Find mitochondrial genes for BoneMarrow1
BoneMarrow1[["percent.mt"]] <- PercentageFeatureSet(
BoneMarrow1,
pattern = "^MT-") |>
suppressWarnings()
# Create the BoneMarrow2 Seurat object
BoneMarrow2 <- Seurat::CreateSeuratObject(
counts = BoneMarrow_data2,
project = "BoneMarrow2",
assay = "DNA",
min.cells = 10,
min.features = 200) |>
suppressWarnings()
# Find mitochondrial genes for BoneMarrow2
BoneMarrow2[["percent.mt"]] <- PercentageFeatureSet(
BoneMarrow2,
pattern = "^MT-") |>
suppressWarnings()
VlnPlot(BoneMarrow1,
features = c("nFeature_DNA",
"nCount_DNA",
"percent.mt"),
ncol = 3) |>
suppressWarnings()VlnPlot(BoneMarrow2,
features = c("nFeature_DNA",
"nCount_DNA",
"percent.mt"),
ncol = 3) |>
suppressWarnings()2.3 Transformation & Feature selection & Scaling
I will try log1pCPM, and SCTransform to transform the data and compare the results.
# Transform the BoneMarrow1 data
# Seurat default log-normalization (log1pCP10k)
pbmc_log1pCP1k_1 <- NormalizeData(
BoneMarrow1,
normalization.method = "LogNormalize",
scale.factor = 1000)
# identify the highly variable genes
pbmc_features_log1pCP1k_1 <- FindVariableFeatures(
pbmc_log1pCP1k_1,
selection.method = "vst",
nfeatures = 2000) # tune this parameter
# Scale the data
pbmc_scaled_log1pCP1k_1 <-
ScaleData(pbmc_features_log1pCP1k_1,
features =
VariableFeatures(pbmc_features_log1pCP1k_1))
# Transform the BoneMarrow2 data
# Seurat default log-normalization (log1pCP10k)
pbmc_log1pCP1k_2 <- NormalizeData(
BoneMarrow2,
normalization.method = "LogNormalize",
scale.factor = 1000)
# identify the highly variable genes
pbmc_features_log1pCP1k_2 <- FindVariableFeatures(
pbmc_log1pCP1k_2,
selection.method = "vst",
nfeatures = 2000) # tune this parameter
# Scale the data
pbmc_scaled_log1pCP1k_2 <-
ScaleData(pbmc_features_log1pCP1k_2,
features =
VariableFeatures(pbmc_features_log1pCP1k_2))Try SCTransform to transform the data.
# SCTransform
# SCTransform the BoneMarrow1 data
BoneMarrow1_SCTransform <- SCTransform(
BoneMarrow1,
verbose = FALSE,
variable.features.n = 3000, # tune this parameter
return.only.var.genes = TRUE,
assay = "DNA",
vars.to.regress = c("nCount_DNA", "percent.mt"))
# SCTransform the BoneMarrow2 data
BoneMarrow2_SCTransform <- SCTransform(
BoneMarrow2,
verbose = FALSE,
variable.features.n = 3000, # tune this parameter
return.only.var.genes = TRUE,
assay = "DNA",
vars.to.regress = c("nCount_DNA", "percent.mt"))Calculate the variance of transformed data and plot it against the mean.
# Calculate the gene variance of transformed data
# and plot it against the gene mean
# BoneMarrow1 Log1pCPM
gene_var_log1pCPM_1 <-
pbmc_scaled_log1pCP1k_1[["DNA"]]@layers$scale.data |>
apply(1, var)
gene_mean_log1pCPM_1 <-
pbmc_scaled_log1pCP1k_1[["DNA"]]@layers$scale.data |>
apply(1, mean)
# BoneMarrow2 Log1pCPM
gene_var_log1pCPM_2 <-
pbmc_scaled_log1pCP1k_2[["DNA"]]@layers$scale.data |>
apply(1, var)
gene_mean_log1pCPM_2 <-
pbmc_scaled_log1pCP1k_2[["DNA"]]@layers$scale.data |>
apply(1, mean)
# BoneMarrow1 SCTransform
gene_var_SCTransform_1 <-
BoneMarrow1_SCTransform[["SCT"]]@scale.data |>
apply(1, var)
gene_mean_SCTransform_1 <-
BoneMarrow1_SCTransform[["SCT"]]@scale.data |>
apply(1, mean)
# BoneMarrow2 SCTransform
gene_var_SCTransform_2 <-
BoneMarrow2_SCTransform[["SCT"]]@scale.data |>
apply(1, var)
gene_mean_SCTransform_2 <-
BoneMarrow2_SCTransform[["SCT"]]@scale.data |>
apply(1, mean)
# Plot the gene variance against the gene mean
# Create a data frame for plotting
gene_var_mean_df <-
data.frame(Gene_Variance =
c(gene_var_log1pCPM_1,
gene_var_log1pCPM_2,
gene_var_SCTransform_1,
gene_var_SCTransform_2),
Gene_Mean =
c(gene_mean_log1pCPM_1,
gene_mean_log1pCPM_2,
gene_mean_SCTransform_1,
gene_mean_SCTransform_2),
Method =
c(rep("Log1pCPM",
length(gene_var_log1pCPM_1) +
length(gene_var_log1pCPM_2)),
rep("SCTransform",
length(gene_var_SCTransform_1) +
length(gene_var_SCTransform_2))))
# Plot the gene means vs. different variances
ggplot(gene_var_mean_df, aes(x = Gene_Mean,
y = Gene_Variance,
color = Method)) +
geom_point(alpha = 0.3) +
theme_grey() +
facet_wrap(~ Method, scales = "free") +
labs(title = "Gene Variance vs. Gene Mean",
x = "Gene Mean",
y = "Gene Variance") +
theme(legend.position = "none") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5))# Calculate the Spearman correlation
correlation_log1pCPM_1 <-
cor(gene_var_log1pCPM_1, gene_mean_log1pCPM_1,
method = "spearman")
correlation_log1pCPM_2 <-
cor(gene_var_log1pCPM_2, gene_mean_log1pCPM_2,
method = "spearman")
correlation_SCTransform_1 <-
cor(gene_var_SCTransform_1, gene_mean_SCTransform_1,
method = "spearman")
correlation_SCTransform_2 <-
cor(gene_var_SCTransform_2, gene_mean_SCTransform_2,
method = "spearman")
# print the Spearman correlation results
cbind(Method = c("Log1pCPM", "Log1pCPM",
"SCTransform", "SCTransform"),
Dataset = c("BoneMarrow1", "BoneMarrow2",
"BoneMarrow1", "BoneMarrow2"),
Correlation = c(correlation_log1pCPM_1,
correlation_log1pCPM_2,
correlation_SCTransform_1,
correlation_SCTransform_2)) |>
knitr::kable()| Method | Dataset | Correlation |
|---|---|---|
| Log1pCPM | BoneMarrow1 | 0.974603251887739 |
| Log1pCPM | BoneMarrow2 | 0.961849201395136 |
| SCTransform | BoneMarrow1 | -0.01872351035011 |
| SCTransform | BoneMarrow2 | 0.00471352040124233 |
It can be seen from the resutls that the SCTransform method has a lower correlation between gene variance and gene mean compared to the Log1pCPM method. This indicates that the SCTransform method is better at normalizing the data. Therefore, I will use the SCTransform method for downstream analysis.
2.4 Linear dimensional reduction
# perform PCA
BoneMarrow1_pbmc_PCA <- RunPCA(
BoneMarrow1_SCTransform,
features = VariableFeatures(BoneMarrow1_SCTransform),
npcs = 100, # tune this parameter
verbose = FALSE)
ElbowPlot(BoneMarrow1_pbmc_PCA, ndims = 100)BoneMarrow2_pbmc_PCA <- RunPCA(
BoneMarrow2_SCTransform,
features = VariableFeatures(BoneMarrow2_SCTransform),
npcs = 100, # tune this parameter
verbose = FALSE)
ElbowPlot(BoneMarrow2_pbmc_PCA, ndims = 100)2.5 Get the gene names
Ensembl IDs to gene
mrt = useMart(biomart = "ensembl",
dataset = "hsapiens_gene_ensembl")
if (file.exists("BoneMarrow_data1_IDs.rds")) {
BoneMarrow_data1_IDs <- readRDS("BoneMarrow_data1_IDs.rds")
} else {
BoneMarrow_data1_IDs <- getBM(
attributes = c("ensembl_gene_id","hgnc_symbol"),
filters = 'ensembl_gene_id',
values = rownames(BoneMarrow_data1),
mart = mrt)
saveRDS(BoneMarrow_data1_IDs, "BoneMarrow_data1_IDs.rds")
}
if (file.exists("BoneMarrow_data2_IDs.rds")) {
BoneMarrow_data2_IDs <- readRDS("BoneMarrow_data2_IDs.rds")
} else {
BoneMarrow_data2_IDs <- getBM(
attributes = c("ensembl_gene_id","hgnc_symbol"),
filters = 'ensembl_gene_id',
values = rownames(BoneMarrow_data2),
mart = mrt)
saveRDS(BoneMarrow_data2_IDs, "BoneMarrow_data2_IDs.rds")
}2.6 Clustering
Perform neighborhood-based clustering (I tried different parameters and the ones below gave the best results).
# Perform neighborhood-based clustering
BoneMarrow1_pbmc_neighbor <- FindNeighbors(
BoneMarrow1_pbmc_PCA,
reduction = "pca",
dims = 1:100, # tune this parameter
k.param = 30, # tune this parameter
prune.SNN = 1/10, # tune this parameter
verbose = FALSE)
BoneMarrow1_pbmc_cluster <- FindClusters(
BoneMarrow1_pbmc_neighbor,
resolution = 1.8, # tune this parameter
verbose = FALSE)
BoneMarrow2_pbmc_neighbor <- FindNeighbors(
BoneMarrow2_pbmc_PCA,
reduction = "pca",
dims = 1:100, # tune this parameter
k.param = 40, # tune this parameter
prune.SNN = 1/10, # tune this parameter
verbose = FALSE)
BoneMarrow2_pbmc_cluster <- FindClusters(
BoneMarrow2_pbmc_neighbor,
resolution = 0.8, # tune this parameter
verbose = FALSE)Run UMAP and t-SNE
# Run UMAP
BoneMarrow1_pbmc_UMAP <- RunUMAP(
BoneMarrow1_pbmc_cluster,
min.dist = 0.3,
n.neighbors = 30,
dims = 1:79,
metric = "cosine",
verbose = FALSE) |>
suppressWarnings()
BoneMarrow2_pbmc_UMAP <- RunUMAP(
BoneMarrow2_pbmc_cluster,
min.dist = 0.3,
n.neighbors = 30,
dims = 1:87,
metric = "cosine",
verbose = FALSE)
# try t_SNE
BoneMarrow1_pbmc_tSNE <- RunTSNE(
BoneMarrow1_pbmc_cluster,
dims = 1:50,
verbose = FALSE)
BoneMarrow2_pbmc_tSNE <- RunTSNE(
BoneMarrow2_pbmc_cluster,
dims = 1:98,
verbose = FALSE)Plot the UMAP
DimPlot(BoneMarrow1_pbmc_UMAP,
reduction = "umap",
label = TRUE,
group.by = "seurat_clusters",
alpha = 0.5) +
ggtitle("BoneMarrow1 UMAP")DimPlot(BoneMarrow2_pbmc_UMAP,
reduction = "umap",
label = TRUE,
group.by = "seurat_clusters",
alpha = 0.5) +
ggtitle("BoneMarrow2 UMAP")Plot the t-SNE
DimPlot(BoneMarrow1_pbmc_tSNE,
reduction = "tsne",
label = TRUE,
group.by = "seurat_clusters",
alpha = 0.5) +
ggtitle("BoneMarrow1 t-SNE")DimPlot(BoneMarrow2_pbmc_tSNE,
reduction = "tsne",
label = TRUE,
group.by = "seurat_clusters",
alpha = 0.5) +
ggtitle("BoneMarrow2 t-SNE")By comparing the UMAP and t-SNE plots, I decided to use the UMAP for both datasets.
2.7 Annotation Results
2.7.1 BoneMarrow1 data
# Find gene markers for the BoneMarrow1 data
# use the UMAP for the BoneMarrow1 data to find the gene markers
if (file.exists("BoneMarrow1_pbmc_markers.rds")) {
BoneMarrow1_pbmc_markers <- readRDS("BoneMarrow1_pbmc_markers.rds")
} else {
BoneMarrow1_pbmc_markers <-
FindAllMarkers(
BoneMarrow1_pbmc_UMAP,
min.pct = 0.01,
logfc.threshold = 0.01,
test.use = "wilcox",
verbose = FALSE)
saveRDS(BoneMarrow1_pbmc_markers, "BoneMarrow1_pbmc_markers.rds")
}
# select the top 5 highly expressed genes in each cluster
# to roughly identify the cell types
BoneMarrow1_pbmc_markers_top_5 <-
BoneMarrow1_pbmc_markers |>
as_tibble() |>
filter(avg_log2FC > 4) |>
filter(p_val_adj < 0.01) |>
group_by(cluster, p_val_adj) |>
# arrange the gene with the order of
# high in avg_log2FC and low in p_val_adj
arrange(cluster, p_val_adj, desc(avg_log2FC)) |>
mutate(gene_names = map_chr(
gene, ~ BoneMarrow_data1_IDs$hgnc_symbol[
match(.x, BoneMarrow_data1_IDs$ensembl_gene_id)])) |>
# select the top 5 genes in each cluster
group_by(cluster) |>
dplyr::slice(1:5) |>
dplyr::select(cluster, gene_names, everything())
# export the top 5 highly expressed genes each cluster to a xlsx file
if (!file.exists("BoneMarrow1_pbmc_markers_top_5.xlsx")) {
write_xlsx(BoneMarrow1_pbmc_markers_top_5,
"BoneMarrow1_pbmc_markers_top_5.xlsx")
}
BoneMarrow1_pbmc_markers_top_5 |>
knitr::kable()| cluster | gene_names | p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | gene |
|---|---|---|---|---|---|---|---|
| 0 | CCR4 | 0 | 4.874914 | 0.382 | 0.014 | 0.0000000 | ENSG00000183813 |
| 0 | TTC39C-AS1 | 0 | 4.401808 | 0.091 | 0.004 | 0.0000000 | ENSG00000264745 |
| 0 | WNT7A | 0 | 4.294893 | 0.087 | 0.004 | 0.0000000 | ENSG00000154764 |
| 0 | FOXP3 | 0 | 4.169362 | 0.044 | 0.002 | 0.0000000 | ENSG00000049768 |
| 1 | MAFB | 0 | 4.238447 | 0.972 | 0.106 | 0.0000000 | ENSG00000204103 |
| 1 | THBS1 | 0 | 4.626654 | 0.816 | 0.069 | 0.0000000 | ENSG00000137801 |
| 1 | PID1 | 0 | 4.263354 | 0.624 | 0.039 | 0.0000000 | ENSG00000153823 |
| 1 | CYP1B1 | 0 | 5.438358 | 0.543 | 0.025 | 0.0000000 | ENSG00000138061 |
| 1 | CD300E | 0 | 4.320640 | 0.571 | 0.030 | 0.0000000 | ENSG00000186407 |
| 2 | KLRC2 | 0 | 4.734576 | 0.829 | 0.053 | 0.0000000 | ENSG00000205809 |
| 2 | TKTL1 | 0 | 5.867000 | 0.566 | 0.017 | 0.0000000 | ENSG00000007350 |
| 2 | KIR3DL2 | 0 | 4.176698 | 0.483 | 0.028 | 0.0000000 | ENSG00000240403 |
| 2 | KLRC3 | 0 | 4.646220 | 0.415 | 0.021 | 0.0000000 | ENSG00000205810 |
| 2 | LDB2 | 0 | 4.309103 | 0.102 | 0.005 | 0.0000000 | ENSG00000169744 |
| 3 | AZU1 | 0 | 7.113508 | 0.741 | 0.034 | 0.0000000 | ENSG00000172232 |
| 3 | MPO | 0 | 7.051123 | 0.615 | 0.030 | 0.0000000 | ENSG00000005381 |
| 3 | ELANE | 0 | 7.931680 | 0.498 | 0.014 | 0.0000000 | ENSG00000197561 |
| 3 | RNASE2 | 0 | 4.601070 | 0.678 | 0.054 | 0.0000000 | ENSG00000169385 |
| 3 | CTSG | 0 | 7.272826 | 0.390 | 0.007 | 0.0000000 | ENSG00000100448 |
| 4 | TNFRSF9 | 0 | 5.339711 | 0.261 | 0.012 | 0.0000000 | ENSG00000049249 |
| 4 | CCL3-AS1 | 0 | 4.315129 | 0.227 | 0.017 | 0.0000000 | ENSG00000277089 |
| 4 | CXCR6 | 0 | 4.409252 | 0.177 | 0.010 | 0.0000000 | ENSG00000172215 |
| 4 | DKK3 | 0 | 4.144354 | 0.148 | 0.010 | 0.0000000 | ENSG00000050165 |
| 4 | VCAM1 | 0 | 6.842211 | 0.025 | 0.000 | 0.0000000 | ENSG00000162692 |
| 5 | MS4A1 | 0 | 7.384164 | 0.866 | 0.015 | 0.0000000 | ENSG00000156738 |
| 5 | CD79A | 0 | 7.144223 | 0.985 | 0.034 | 0.0000000 | ENSG00000105369 |
| 5 | LINC00926 | 0 | 7.117973 | 0.738 | 0.012 | 0.0000000 | ENSG00000247982 |
| 5 | BANK1 | 0 | 6.714238 | 0.713 | 0.019 | 0.0000000 | ENSG00000153064 |
| 5 | FCER2 | 0 | 7.537949 | 0.554 | 0.006 | 0.0000000 | ENSG00000104921 |
| 6 | MYOM2 | 0 | 5.380428 | 0.885 | 0.050 | 0.0000000 | ENSG00000036448 |
| 6 | FGFBP2 | 0 | 4.273741 | 0.935 | 0.093 | 0.0000000 | ENSG00000137441 |
| 6 | SH2D1B | 0 | 4.317394 | 0.620 | 0.037 | 0.0000000 | ENSG00000198574 |
| 6 | SPON2 | 0 | 4.029231 | 0.975 | 0.208 | 0.0000000 | ENSG00000159674 |
| 6 | LINC00299 | 0 | 5.023763 | 0.370 | 0.016 | 0.0000000 | ENSG00000236790 |
| 7 | FSBP | 0 | 5.568025 | 0.012 | 0.000 | 0.0002713 | ENSG00000265817 |
| 7 | CELA1 | 0 | 5.568025 | 0.012 | 0.000 | 0.0002713 | ENSG00000139610 |
| 8 | CCL3L3 | 0 | 4.485081 | 0.781 | 0.057 | 0.0000000 | ENSG00000276085 |
| 8 | KCNE1 | 0 | 4.955907 | 0.323 | 0.014 | 0.0000000 | ENSG00000180509 |
| 8 | TNFAIP6 | 0 | 4.494003 | 0.303 | 0.022 | 0.0000000 | ENSG00000123610 |
| 8 | 0 | 4.503048 | 0.181 | 0.008 | 0.0000000 | ENSG00000287553 | |
| 8 | CXCL1 | 0 | 4.981095 | 0.168 | 0.008 | 0.0000000 | ENSG00000163739 |
| 9 | TRAV14DV4 | 0 | 4.364786 | 0.263 | 0.014 | 0.0000000 | ENSG00000211792 |
| 9 | TRDV2 | 0 | 6.154833 | 0.180 | 0.005 | 0.0000000 | ENSG00000211821 |
| 9 | ZNF683 | 0 | 4.564768 | 0.218 | 0.010 | 0.0000000 | ENSG00000176083 |
| 9 | TRBV5-1 | 0 | 5.172667 | 0.241 | 0.014 | 0.0000000 | ENSG00000211734 |
| 9 | TRGV2 | 0 | 4.735881 | 0.158 | 0.009 | 0.0000000 | ENSG00000233306 |
| 10 | SLC4A10 | 0 | 7.069240 | 0.250 | 0.002 | 0.0000000 | ENSG00000144290 |
| 10 | TRAV1-2 | 0 | 4.894153 | 0.189 | 0.006 | 0.0000000 | ENSG00000256553 |
| 10 | LINC01644 | 0 | 5.024152 | 0.144 | 0.004 | 0.0000000 | ENSG00000218357 |
| 10 | TRBV6-4 | 0 | 5.680197 | 0.091 | 0.002 | 0.0000000 | ENSG00000211713 |
| 10 | KIF5C | 0 | 4.225359 | 0.091 | 0.005 | 0.0000000 | ENSG00000168280 |
| 11 | LINC02812 | 0 | 9.882266 | 0.825 | 0.001 | 0.0000000 | ENSG00000230138 |
| 11 | LILRA4 | 0 | 8.612708 | 0.992 | 0.012 | 0.0000000 | ENSG00000239961 |
| 11 | PTPRS | 0 | 8.572198 | 0.775 | 0.003 | 0.0000000 | ENSG00000105426 |
| 11 | CLEC4C | 0 | 8.503754 | 0.850 | 0.004 | 0.0000000 | ENSG00000198178 |
| 11 | SCT | 0 | 8.283629 | 0.592 | 0.002 | 0.0000000 | ENSG00000070031 |
| 12 | SPTA1 | 0 | 6.572461 | 0.030 | 0.000 | 0.0000000 | ENSG00000163554 |
| 12 | SELP | 0 | 6.350069 | 0.030 | 0.000 | 0.0000000 | ENSG00000174175 |
| 12 | MFSD2B | 0 | 5.765106 | 0.030 | 0.000 | 0.0000000 | ENSG00000205639 |
| 12 | POLQ | 0 | 5.765106 | 0.030 | 0.000 | 0.0000000 | ENSG00000051341 |
| 12 | B3GALNT1 | 0 | 5.765106 | 0.030 | 0.000 | 0.0000000 | ENSG00000169255 |
| 13 | CRHBP | 0 | 11.350012 | 0.762 | 0.000 | 0.0000000 | ENSG00000145708 |
| 13 | CD34 | 0 | 8.263918 | 0.845 | 0.004 | 0.0000000 | ENSG00000174059 |
| 13 | CYTL1 | 0 | 7.493245 | 0.690 | 0.007 | 0.0000000 | ENSG00000170891 |
| 13 | NPR3 | 0 | 8.010162 | 0.560 | 0.003 | 0.0000000 | ENSG00000113389 |
| 13 | EGFL7 | 0 | 5.907402 | 0.833 | 0.019 | 0.0000000 | ENSG00000172889 |
| 14 | LINC02446 | 0 | 5.434920 | 0.836 | 0.037 | 0.0000000 | ENSG00000256039 |
| 14 | CD8B | 0 | 4.262326 | 0.970 | 0.076 | 0.0000000 | ENSG00000172116 |
| 14 | C20orf204 | 0 | 4.298116 | 0.209 | 0.011 | 0.0000000 | ENSG00000196421 |
| 14 | PCSK1N | 0 | 4.345422 | 0.224 | 0.015 | 0.0000000 | ENSG00000102109 |
| 14 | CD248 | 0 | 7.667350 | 0.045 | 0.000 | 0.0000000 | ENSG00000174807 |
| 15 | CLEC10A | 0 | 7.985711 | 0.578 | 0.004 | 0.0000000 | ENSG00000132514 |
| 15 | ENHO | 0 | 7.668228 | 0.356 | 0.002 | 0.0000000 | ENSG00000168913 |
| 15 | CD1C | 0 | 6.268298 | 0.578 | 0.013 | 0.0000000 | ENSG00000158481 |
| 15 | FCER1A | 0 | 5.012487 | 0.844 | 0.040 | 0.0000000 | ENSG00000179639 |
| 15 | AXL | 0 | 6.400748 | 0.289 | 0.004 | 0.0000000 | ENSG00000167601 |
| 16 | LYPD2 | 0 | 13.985578 | 0.543 | 0.000 | 0.0000000 | ENSG00000197353 |
| 16 | UICLM | 0 | 10.106432 | 0.629 | 0.000 | 0.0000000 | ENSG00000233392 |
| 16 | HES4 | 0 | 7.714115 | 0.514 | 0.003 | 0.0000000 | ENSG00000188290 |
| 16 | CDKN1C | 0 | 6.806037 | 0.571 | 0.007 | 0.0000000 | ENSG00000129757 |
| 16 | VMO1 | 0 | 9.205968 | 0.257 | 0.000 | 0.0000000 | ENSG00000182853 |
| 17 | SDC1 | 0 | 10.283611 | 0.548 | 0.000 | 0.0000000 | ENSG00000115884 |
| 17 | TNFRSF17 | 0 | 8.133368 | 0.645 | 0.003 | 0.0000000 | ENSG00000048462 |
| 17 | JSRP1 | 0 | 10.605539 | 0.452 | 0.000 | 0.0000000 | ENSG00000167476 |
| 17 | IGLV3-1 | 0 | 11.477532 | 0.581 | 0.005 | 0.0000000 | ENSG00000211673 |
| 17 | PARM1 | 0 | 7.476256 | 0.452 | 0.003 | 0.0000000 | ENSG00000169116 |
By looking up the highly expressed genes in each cluster online and in the literature, I can roughly identify the cell types in the BoneMarrow1 data. I found the following cell types:
| Cluster | Cell Type |
|---|---|
| 0 | Regulatory T-cell |
| 1 | Macrophage Cell |
| 2 | Natural Killer Cell |
| 3 | Neutrophil Precursors |
| 4 | CD8 T Cell |
| 5 | Naïve B Cell |
| 6 | Natural Killer Cell |
| 7 | Megakaryocyte Cell |
| 8 | Macrophage Cell |
| 9 | CD8 T Cell |
| 10 | Mucosal-associated Invariant T Cell |
| 11 | Plasmacytoid Dendritic Cell |
| 12 | Erythroid Cells |
| 13 | Hematopoietic Stem Cells |
| 14 | CD8 T Cell |
| 15 | Dendritic Cell |
| 16 | Non-classical Monocyte Cell |
| 17 | Plasma Cell |
Since I already have the primary cell types for the BoneMarrow1 data from the top 5 highly expressed genes in each cluster, we can verfiy the cell types by gene markers from the literature or from online databases (e.g., CellMarker 2.0, PanglaoDB)
And I found the following gene markers for the cell types in the BoneMarrow1 data:
| Cell Type | Gene Markers |
|---|---|
| Regulatory T-cell | CD25, CD4, FOXP3, CCR4, CCR6 |
| Macrophage Cell | ARG, CCL2, CD163, CD206, FIZZ1, IL-10, MRC1, CD163, CD206, CD14 |
| Natural Killer Cell | TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX |
| Neutrophil Precursors | S100A8, S100A9, S100A12, LCN2, LTF |
| CD8 T Cell | CD3, CD5, CD8, CD27, CD28 |
| Naïve B Cell | P2RX5, PTPRCAP |
| Natural Killer Cell | TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX |
| Megakaryocyte Cell | IL6ST |
| Macrophage Cell | ARG, CCL2, CD163, CD206, FIZZ1, IL-10, MRC1, CD163, CD206, CD14 |
| CD8 T Cell | CD3, CD5, CD8, CD27, CD28 |
| Mucosal-associated Invariant T Cell | TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10 |
| Plasmacytoid Dendritic Cell | BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1 |
| Erythroid Cells | AHSP, HBA-A1, HBA-A2, HIST1H2AO, STFA1, ALAD, PNPO |
| Hematopoietic Stem Cells | CD34, MECOM, EGR1 |
| CD8 T Cell | CD8A, CD8B, CD3D, CD3E, CD3G, CD4, CD5, CD6, CD7, CD8, CD8, CD8 |
| Dendritic Cell | CD1C, CD86 |
| Non-classical Monocyte Cell | CD14, CD16, CX3CR1, LYPD2 |
| Plasma Cell | MZB1, IGHG1, JCHAIN, SPAG4, TGM5 |
Then plot feature plots for the classic gene markers for each cell type to verify the cell types in the BoneMarrow1 data.
- Regulatory T-cell: CD25, CD4, FOXP3, CCR4, CCR6
# define a function to plot feature plots for the classic gene markers
plot_feature_plots <- function(cell_type, classic_gene_markers) {
# map the gene names back to the ENSEMBL IDs
classic_gene_maker_names <- c(
classic_gene_markers)
Ensembl <- getBM(
attributes=c("ensembl_gene_id","hgnc_symbol"),
filters = 'hgnc_symbol', values = classic_gene_maker_names,
mart = mrt)
feature_plot <- FeaturePlot(
cell_type,
features = Ensembl$ensembl_gene_id,
pt.size = 0.1) |>
suppressWarnings()
violin_plot <- VlnPlot(
cell_type,
features = Ensembl$ensembl_gene_id,
pt.size = 0.1,
ncol = 2) |>
suppressWarnings()
gene_id_names <- Ensembl |>
# pivot_wider(names_from = hgnc_symbol,
# values_from = ensembl_gene_id) |>
kable() |>
suppressWarnings()
return(list(
gene_id_names = gene_id_names,
feature_plot = feature_plot,
violin_plot = violin_plot
))
}
# map the gene names back to the ENSEMBL IDs
Regulatory_T_classic_gene_maker_names <- c(
"CD25", "CD4", "FOXP3", "CCR4", "CCR6")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Regulatory_T_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000183813 |CCR4 |
|ENSG00000112486 |CCR6 |
|ENSG00000010610 |CD4 |
|ENSG00000049768 |FOXP3 |
$feature_plot
$violin_plot
- Macrophage Cell: ARG, CCL2, CD163, CD206, FIZZ1, IL-10, MRC1, CD163, CD206
Macrophage_Cell_classic_gene_maker_names <- c(
"ARG", "CCL2", "CD163", "CD206", "FIZZ1", "IL-10", "MRC1", "CD163", "CD206", "CD14")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Macrophage_Cell_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000108691 |CCL2 |
|ENSG00000170458 |CD14 |
|ENSG00000177575 |CD163 |
|ENSG00000260314 |MRC1 |
$feature_plot
$violin_plot
- NK Cell: TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX
NK_Cell_classic_gene_maker_names <- c(
"TRDC", "KLRF1", "GNLY", "NCR1", "GZMA", "HOPX")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
NK_Cell_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000115523 |GNLY |
|ENSG00000145649 |GZMA |
|ENSG00000171476 |HOPX |
|ENSG00000150045 |KLRF1 |
|ENSG00000275521 |NCR1 |
|ENSG00000278025 |NCR1 |
|ENSG00000277442 |NCR1 |
|ENSG00000277334 |NCR1 |
|ENSG00000276450 |NCR1 |
|ENSG00000277629 |NCR1 |
|ENSG00000273506 |NCR1 |
|ENSG00000275637 |NCR1 |
|ENSG00000277824 |NCR1 |
|ENSG00000275822 |NCR1 |
|ENSG00000274053 |NCR1 |
|ENSG00000278362 |NCR1 |
|ENSG00000273916 |NCR1 |
|ENSG00000273535 |NCR1 |
|ENSG00000275156 |NCR1 |
|ENSG00000189430 |NCR1 |
|ENSG00000211829 |TRDC |
$feature_plot
$violin_plot
- Neutrophil Precursors: S100A8, S100A9, S100A12, LCN2, LTF
Neutrophil_Precursors_classic_gene_maker_names <- c("S100A8", "S100A9", "S100A12", "LCN2", "LTF")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Neutrophil_Precursors_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000148346 |LCN2 |
|ENSG00000012223 |LTF |
|ENSG00000163221 |S100A12 |
|ENSG00000143546 |S100A8 |
|ENSG00000163220 |S100A9 |
$feature_plot
$violin_plot
- CD8 T Cell: CD3, CD5, CD8, CD27, CD28
CD8_T_Cell_classic_gene_maker_names <- c("CD3", "CD5", "CD8", "CD27", "CD28")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
CD8_T_Cell_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000139193 |CD27 |
|ENSG00000178562 |CD28 |
|ENSG00000110448 |CD5 |
$feature_plot
$violin_plot
- Naïve B Cell: P2RX5, PTPRCAP
Naive_B_Cell_classic_gene_maker_names <- c("P2RX5", "PTPRCAP")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Naive_B_Cell_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000083454 |P2RX5 |
|ENSG00000213402 |PTPRCAP |
$feature_plot
$violin_plot
- Megakaryocyte Cell: IL6ST
Megakaryocyte_Cell_classic_gene_maker_names <- c("IL6ST")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Megakaryocyte_Cell_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000134352 |IL6ST |
$feature_plot
$violin_plot
- Mucosal-associated Invariant T Cell: TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10
Mucosal_associated_Invariant_T_Cell_classic_gene_maker_names <- c("TRAV1-2", "TRAJ33", "TRAJ20", "TRAJ12", "SLC4A10")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Mucosal_associated_Invariant_T_Cell_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000144290 |SLC4A10 |
|ENSG00000211877 |TRAJ12 |
|ENSG00000211869 |TRAJ20 |
|ENSG00000211856 |TRAJ33 |
|ENSG00000256553 |TRAV1-2 |
$feature_plot
$violin_plot
- Plasmacytoid Dendritic Cell: BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1,CD40, CD80, CD86,
Plasmacytoid_Dendritic_Cell_classic_gene_maker_names <- c(
"BDCA2", "BDCA-4", "CD123", "CD303", "CD304", "CLEC4C", "DR6", "Fc-epsilon RI-alpha", "ILT3", "ILT7", "NRP1")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Plasmacytoid_Dendritic_Cell_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000198178 |CLEC4C |
|ENSG00000099250 |NRP1 |
$feature_plot
$violin_plot
- Erythroid Cell: AHSP, HBA-A1, HBA-A2, HIST1H2AO, STFA1, ALAD, PNPO
Erythroid_Cell_classic_gene_maker_names <- c("AHSP", "HBA-A1", "HBA-A2", "HIST1H2AO", "STFA1","ALAD", "PNPO")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Erythroid_Cell_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000169877 |AHSP |
|ENSG00000148218 |ALAD |
|ENSG00000108439 |PNPO |
$feature_plot
$violin_plot
- Hematopoietic Stem Cell: CD34, MECOM, EGR1
Hematopoietic_Stem_Cell_classic_gene_maker_names <- c("CD34", "MECOM", "EGR1")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Hematopoietic_Stem_Cell_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000174059 |CD34 |
|ENSG00000120738 |EGR1 |
|ENSG00000085276 |MECOM |
$feature_plot
$violin_plot
- Dendritic Cell: CD1C, CD86
Dendritic_Cell_classic_gene_maker_names <- c(
"CD1C", "CD86")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Dendritic_Cell_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000158481 |CD1C |
|ENSG00000114013 |CD86 |
$feature_plot
$violin_plot
- Non-classical Monocyte Cell: CD14, CD16, CX3CR1, LYPD2
Non_classical_Monocyte_Cell_classic_gene_maker_names <- c(
"CD14", "CD16", "LYPD2")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Non_classical_Monocyte_Cell_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000170458 |CD14 |
|ENSG00000197353 |LYPD2 |
$feature_plot
$violin_plot
- Plasma Cell: MZB1, IGHG1, JCHAIN, SPAG4, TGM5
Plasma_Cell_classic_gene_maker_names <- c("MZB1", "IGHG1", "JCHAIN", "SPAG4", "TGM5")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Plasma_Cell_classic_gene_maker_names)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000277633 |IGHG1 |
|ENSG00000211896 |IGHG1 |
|ENSG00000132465 |JCHAIN |
|ENSG00000170476 |MZB1 |
|ENSG00000061656 |SPAG4 |
|ENSG00000104055 |TGM5 |
$feature_plot
$violin_plot
Annotate the cell types for the BoneMarrow1 data
# Annotate the cell types for the BoneMarrow1 data
# manually rename the seurat_clusters to the cell types
BoneMarrow1_pbmc_UMAP_new_meta <- BoneMarrow1_pbmc_UMAP@meta.data |>
# change the seurat_clusters to cell types
dplyr::mutate(cell_type = case_when(
seurat_clusters == 0 ~ "Regulatory T-cell",
seurat_clusters == 1 ~ "Macrophage Cell",
seurat_clusters == 2 ~ "Natural Killer Cell",
seurat_clusters == 3 ~ "Neutrophil Precursors",
seurat_clusters == 4 ~ "CD8 T cell",
seurat_clusters == 5 ~ "Naïve B cell",
seurat_clusters == 6 ~ "Natural Killer Cell",
seurat_clusters == 7 ~ "Megakaryocyte Cell",
seurat_clusters == 8 ~ "Macrophage Cell",
seurat_clusters == 9 ~ "CD8 T cell",
seurat_clusters == 10 ~ "Mucosal-associated Invariant T Cell",
seurat_clusters == 11 ~ "Plasmacytoid Dendritic Cell",
seurat_clusters == 12 ~ "Erythroid Cells",
seurat_clusters == 13 ~ "Hematopoietic Stem Cells",
seurat_clusters == 14 ~ "CD8 T cell",
seurat_clusters == 15 ~ "Dendritic Cell",
seurat_clusters == 16 ~ "Non-classical Monocyte Cell",
seurat_clusters == 17 ~ "Plasma Cell")) |>
dplyr::mutate(cell_type = as.factor(cell_type))
# add the cell types to the UMAP object
BoneMarrow1_pbmc_UMAP <- AddMetaData(
BoneMarrow1_pbmc_UMAP,
metadata = BoneMarrow1_pbmc_UMAP_new_meta,
col.name = "cell_type")
# export the UMAP object to the RDS file
saveRDS(BoneMarrow1_pbmc_UMAP, "BoneMarrow1_pbmc_UMAP.rds")2.7.1.1 Summary Plots
Plot the UMAP with the cell types for the BoneMarrow1 data
DimPlot(BoneMarrow1_pbmc_UMAP,
reduction = "umap",
label = TRUE,
group.by = "cell_type",
label.size = 3,
repel = TRUE,
alpha = 0.5) +
ggtitle("BoneMarrow1 Cell Types") +
NoLegend()Plot the heatmap
DoHeatmap(
BoneMarrow1_pbmc_UMAP,
features = BoneMarrow1_pbmc_markers_top_5$gene,
label = TRUE,
group.by = "cell_type",
size = 4) |>
suppressWarnings() +
NoLegend() +
ggtitle("BoneMarrow1 Heatmap") +
rremove("y.text") 2.7.2 BoneMarrow2 data
# I decided to use the UMAP for the BoneMarrow2 data
# to find the gene markers
if (file.exists("BoneMarrow2_pbmc_markers.rds")) {
BoneMarrow2_pbmc_markers <- readRDS("BoneMarrow2_pbmc_markers.rds")
} else {
BoneMarrow2_pbmc_markers <-
FindAllMarkers(
BoneMarrow2_pbmc_UMAP,
min.pct = 0.01,
logfc.threshold = 0.01,
test.use = "wilcox",
verbose = FALSE)
saveRDS(BoneMarrow2_pbmc_markers, "BoneMarrow2_pbmc_markers.rds")
}
BoneMarrow2_pbmc_markers_top_5 <-
BoneMarrow2_pbmc_markers |>
as_tibble() |>
filter(avg_log2FC > 4) |>
filter(p_val_adj < 0.01) |>
group_by(cluster, p_val_adj) |>
# arrange the gene with the order of
# high in avg_log2FC and low in p_val_adj
arrange(cluster, p_val_adj, desc(avg_log2FC)) |>
mutate(gene_names = map_chr(
gene, ~ BoneMarrow_data2_IDs$hgnc_symbol[
match(.x, BoneMarrow_data2_IDs$ensembl_gene_id)])) |>
# select the top 5 genes in each cluster
group_by(cluster) |>
dplyr::slice(1:5) |>
dplyr::select(cluster, gene_names, everything())
# export the top 5 highly expressed genes each cluster to a xlsx file
if (!file.exists("BoneMarrow2_pbmc_markers_top_5.xlsx")) {
write_xlsx(BoneMarrow2_pbmc_markers_top_5,
"BoneMarrow2_pbmc_markers_top_5.xlsx")
}
BoneMarrow2_pbmc_markers_top_5 |>
knitr::kable()| cluster | gene_names | p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | gene |
|---|---|---|---|---|---|---|---|
| 0 | S100A12 | 0 | 8.700578 | 0.897 | 0.011 | 0e+00 | ENSG00000163221 |
| 0 | S100A8 | 0 | 7.696677 | 0.993 | 0.093 | 0e+00 | ENSG00000143546 |
| 0 | S100A9 | 0 | 7.390087 | 0.998 | 0.138 | 0e+00 | ENSG00000163220 |
| 0 | CD14 | 0 | 7.300052 | 0.676 | 0.009 | 0e+00 | ENSG00000170458 |
| 0 | CLEC4E | 0 | 7.265346 | 0.657 | 0.007 | 0e+00 | ENSG00000166523 |
| 1 | CCR4 | 0 | 5.170545 | 0.306 | 0.008 | 0e+00 | ENSG00000183813 |
| 1 | TTC39C-AS1 | 0 | 4.442917 | 0.097 | 0.004 | 0e+00 | ENSG00000264745 |
| 1 | FOXP3 | 0 | 4.069459 | 0.062 | 0.004 | 0e+00 | ENSG00000049768 |
| 1 | PI16 | 0 | 5.528891 | 0.039 | 0.000 | 0e+00 | ENSG00000164530 |
| 1 | NEFL | 0 | 4.593021 | 0.043 | 0.001 | 0e+00 | ENSG00000277586 |
| 2 | KLRC2 | 0 | 4.488809 | 0.642 | 0.057 | 0e+00 | ENSG00000205809 |
| 2 | TKTL1 | 0 | 5.667839 | 0.362 | 0.017 | 0e+00 | ENSG00000007350 |
| 2 | KLRC3 | 0 | 4.342381 | 0.345 | 0.020 | 0e+00 | ENSG00000205810 |
| 2 | IL5RA | 0 | 4.732259 | 0.072 | 0.003 | 0e+00 | ENSG00000091181 |
| 3 | MYOM2 | 0 | 4.906103 | 0.886 | 0.055 | 0e+00 | ENSG00000036448 |
| 3 | FGFBP2 | 0 | 4.041734 | 0.905 | 0.100 | 0e+00 | ENSG00000137441 |
| 3 | SH2D1B | 0 | 4.343918 | 0.649 | 0.045 | 0e+00 | ENSG00000198574 |
| 3 | LINC00299 | 0 | 4.531001 | 0.431 | 0.025 | 0e+00 | ENSG00000236790 |
| 3 | PRSS23 | 0 | 4.230768 | 0.455 | 0.034 | 0e+00 | ENSG00000150687 |
| 4 | TNFRSF9 | 0 | 4.412879 | 0.325 | 0.018 | 0e+00 | ENSG00000049249 |
| 4 | ASTL | 0 | 4.889529 | 0.067 | 0.002 | 0e+00 | ENSG00000188886 |
| 4 | VCAM1 | 0 | 6.889529 | 0.029 | 0.000 | 0e+00 | ENSG00000162692 |
| 4 | ZNF80 | 0 | 5.889529 | 0.019 | 0.000 | 1e-07 | ENSG00000174255 |
| 5 | LRRN3 | 0 | 4.684415 | 0.187 | 0.006 | 0e+00 | ENSG00000173114 |
| 6 | SLC4A10 | 0 | 7.301805 | 0.395 | 0.003 | 0e+00 | ENSG00000144290 |
| 6 | TRAV1-2 | 0 | 4.486837 | 0.238 | 0.009 | 0e+00 | ENSG00000256553 |
| 6 | LINC01644 | 0 | 5.420450 | 0.195 | 0.004 | 0e+00 | ENSG00000218357 |
| 6 | TSPAN15 | 0 | 4.319363 | 0.157 | 0.008 | 0e+00 | ENSG00000099282 |
| 6 | IL23R | 0 | 5.216916 | 0.114 | 0.003 | 0e+00 | ENSG00000162594 |
| 7 | LILRA4 | 0 | 9.930713 | 0.975 | 0.005 | 0e+00 | ENSG00000239961 |
| 7 | LINC02812 | 0 | 9.230507 | 0.787 | 0.002 | 0e+00 | ENSG00000230138 |
| 7 | CLEC4C | 0 | 8.741929 | 0.861 | 0.003 | 0e+00 | ENSG00000198178 |
| 7 | LRRC26 | 0 | 8.675292 | 0.672 | 0.003 | 0e+00 | ENSG00000184709 |
| 7 | PROC | 0 | 8.369202 | 0.787 | 0.004 | 0e+00 | ENSG00000115718 |
| 8 | CD1C | 0 | 6.120903 | 0.286 | 0.010 | 0e+00 | ENSG00000158481 |
| 8 | CAVIN2 | 0 | 5.834204 | 0.321 | 0.019 | 0e+00 | ENSG00000168497 |
| 8 | CLEC10A | 0 | 5.894971 | 0.188 | 0.003 | 0e+00 | ENSG00000132514 |
| 8 | ENHO | 0 | 6.330357 | 0.152 | 0.002 | 0e+00 | ENSG00000168913 |
| 8 | C19orf33 | 0 | 7.107964 | 0.134 | 0.001 | 0e+00 | ENSG00000167644 |
| 9 | CD79A | 0 | 8.333834 | 0.958 | 0.021 | 0e+00 | ENSG00000105369 |
| 9 | MS4A1 | 0 | 7.611297 | 0.832 | 0.015 | 0e+00 | ENSG00000156738 |
| 9 | CD19 | 0 | 8.874331 | 0.568 | 0.002 | 0e+00 | ENSG00000177455 |
| 9 | FCER2 | 0 | 7.647739 | 0.653 | 0.006 | 0e+00 | ENSG00000104921 |
| 9 | FCRL1 | 0 | 7.857457 | 0.568 | 0.003 | 0e+00 | ENSG00000163534 |
| 10 | CDKN1C | 0 | 7.368670 | 0.603 | 0.004 | 0e+00 | ENSG00000129757 |
| 10 | CKB | 0 | 8.221113 | 0.562 | 0.003 | 0e+00 | ENSG00000166165 |
| 10 | LYPD2 | 0 | 13.087031 | 0.438 | 0.000 | 0e+00 | ENSG00000197353 |
| 10 | VMO1 | 0 | 9.435237 | 0.438 | 0.001 | 0e+00 | ENSG00000182853 |
| 10 | UICLM | 0 | 8.747181 | 0.438 | 0.001 | 0e+00 | ENSG00000233392 |
| 11 | PRTN3 | 0 | 9.439529 | 0.952 | 0.019 | 0e+00 | ENSG00000196415 |
| 11 | C1QTNF4 | 0 | 9.376598 | 0.629 | 0.003 | 0e+00 | ENSG00000172247 |
| 11 | SMIM24 | 0 | 7.842478 | 0.710 | 0.006 | 0e+00 | ENSG00000095932 |
| 11 | CTSG | 0 | 7.899165 | 0.790 | 0.010 | 0e+00 | ENSG00000100448 |
| 11 | MS4A3 | 0 | 8.151139 | 0.581 | 0.002 | 0e+00 | ENSG00000149516 |
| 12 | SPTSSB | 0 | 6.160634 | 0.500 | 0.010 | 0e+00 | ENSG00000196542 |
| 12 | XCL1 | 0 | 4.973271 | 0.871 | 0.051 | 0e+00 | ENSG00000143184 |
| 12 | KLRC1 | 0 | 4.814189 | 0.613 | 0.035 | 0e+00 | ENSG00000134545 |
| 12 | TNFRSF11A | 0 | 5.751829 | 0.194 | 0.004 | 0e+00 | ENSG00000141655 |
| 12 | ADGRG3 | 0 | 4.967806 | 0.226 | 0.008 | 0e+00 | ENSG00000182885 |
By looking up the highly expressed genes in each cluster online and in the literature, I can roughly identify the cell types in the BoneMarrow2 data. I found the following cell types:
| Cluster | Cell Type |
|---|---|
| 0 | Neutrophil |
| 1 | Regulatory T-cell |
| 2 | Natural Killer Cell |
| 3 | CD16 Natural Killer Cell |
| 4 | CD8 T cell |
| 5 | Regulatory T-cell |
| 6 | Mucosal-associated Invariant |
| 7 | Plasmacytoid Dendritic Cell |
| 8 | Dendritic Cell |
| 9 | Naïve B cell |
| 10 | Non-classical Monocyte Cell |
| 11 | Neutrophil Granulocytes |
| 12 | Natural Killer Cell |
Since I already have the primary cell types for the BoneMarrow2 data from the top 5 highly expressed genes in each cluster, we can verfiy the cell types by gene markers from the literature or from online databases (e.g., CellMarker 2.0, PanglaoDB)
And I found the following gene markers for the cell types in the BoneMarrow2 data:
| Cell Type | Gene Markers |
|---|---|
| Neutrophil | S100A8, S100A9, S100A12, LCN2, LTF |
| Regulatory T-cell | CD25, CD4, FOXP3, CCR4, CCR6 |
| Natural Killer Cell | TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX |
| CD16 Natural Killer Cell | MYOM2, FGFBP2, SH2D1B |
| CD8 T cell | CD3, CD5, CD8, CD27, CD28 |
| Mucosal-associated Invariant T Cell | TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10 |
| Plasmacytoid Dendritic Cell | BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1 |
| Dendritic Cell | CD1C, CD86 |
| Naïve B cell | MS4A1, CD79A, P2RX5, PTPRCAP |
| Non-classical Monocyte Cell | CD14, CD16, CX3CR1, LYPD2 |
| Neutrophil Granulocytes | S100A8, S100A9, S100A12, LCN2, LTF |
Then plot feature plots for the classic gene markers for each cell type to verify the cell types in the BoneMarrow2 data.
- Neutrophil: S100A8, S100A9, S100A12, LCN2, LTF
Neutrophil_markers_bone_marrow2 <- c("S100A8", "S100A9", "S100A12", "LCN2", "LTF")
plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Neutrophil_markers_bone_marrow2)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000148346 |LCN2 |
|ENSG00000012223 |LTF |
|ENSG00000163221 |S100A12 |
|ENSG00000143546 |S100A8 |
|ENSG00000163220 |S100A9 |
$feature_plot
$violin_plot
- Regulatory T-cell: CD25, CD4, FOXP3, CCR4, CCR6
Regulatory_T_cell_markers_bone_marrow2 <- c("CD25", "CD4", "FOXP3", "CCR4", "CCR6")
plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Regulatory_T_cell_markers_bone_marrow2)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000183813 |CCR4 |
|ENSG00000112486 |CCR6 |
|ENSG00000010610 |CD4 |
|ENSG00000049768 |FOXP3 |
$feature_plot
$violin_plot
- Natural Killer Cell: TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX
Natural_Killer_Cell_markers_bone_marrow2 <- c("TRDC", "KLRF1", "GNLY", "NCR1", "GZMA", "HOPX")
plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Natural_Killer_Cell_markers_bone_marrow2)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000115523 |GNLY |
|ENSG00000145649 |GZMA |
|ENSG00000171476 |HOPX |
|ENSG00000150045 |KLRF1 |
|ENSG00000275521 |NCR1 |
|ENSG00000278025 |NCR1 |
|ENSG00000277442 |NCR1 |
|ENSG00000277334 |NCR1 |
|ENSG00000276450 |NCR1 |
|ENSG00000277629 |NCR1 |
|ENSG00000273506 |NCR1 |
|ENSG00000275637 |NCR1 |
|ENSG00000277824 |NCR1 |
|ENSG00000275822 |NCR1 |
|ENSG00000274053 |NCR1 |
|ENSG00000278362 |NCR1 |
|ENSG00000273916 |NCR1 |
|ENSG00000273535 |NCR1 |
|ENSG00000275156 |NCR1 |
|ENSG00000189430 |NCR1 |
|ENSG00000211829 |TRDC |
$feature_plot
$violin_plot
- CD16 Natural Killer Cell: MYOM2, FGFBP2, SH2D1B
CD16_Natural_Killer_Cell_markers_bone_marrow2 <- c("MYOM2", "FGFBP2", "SH2D1B")
plot_feature_plots(BoneMarrow2_pbmc_UMAP,
CD16_Natural_Killer_Cell_markers_bone_marrow2)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000137441 |FGFBP2 |
|ENSG00000274137 |MYOM2 |
|ENSG00000036448 |MYOM2 |
|ENSG00000198574 |SH2D1B |
$feature_plot
$violin_plot
- CD8 T cell: CD3, CD5, CD8, CD27, CD28
CD8_T_cell_markers_bone_marrow2 <- c("CD3", "CD5", "CD8", "CD27", "CD28")
plot_feature_plots(BoneMarrow2_pbmc_UMAP,
CD8_T_cell_markers_bone_marrow2)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000139193 |CD27 |
|ENSG00000178562 |CD28 |
|ENSG00000110448 |CD5 |
$feature_plot
$violin_plot
- Mucosal-associated Invariant T Cell: TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10
Mucosal_associated_Invariant_T_Cell_markers_bone_marrow2 <- c("TRAV1-2", "TRAJ33", "TRAJ20", "TRAJ12", "SLC4A10")
plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Mucosal_associated_Invariant_T_Cell_markers_bone_marrow2)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000144290 |SLC4A10 |
|ENSG00000211877 |TRAJ12 |
|ENSG00000211869 |TRAJ20 |
|ENSG00000211856 |TRAJ33 |
|ENSG00000256553 |TRAV1-2 |
$feature_plot
$violin_plot
- Plasmacytoid Dendritic Cell: BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1
Plasmacytoid_Dendritic_Cell_markers_bone_marrow2 <- c("BDCA2", "BDCA-4", "CD123", "CD303", "CD304", "CLEC4C", "DR6", "Fc-epsilon RI-alpha", "ILT3", "ILT7", "NRP1")
plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Plasmacytoid_Dendritic_Cell_markers_bone_marrow2)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000198178 |CLEC4C |
|ENSG00000099250 |NRP1 |
$feature_plot
$violin_plot
- Dendritic Cell: CD1C, CD86
Dendritic_Cell_markers_bone_marrow2 <- c("CD1C", "CD86")
plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Dendritic_Cell_markers_bone_marrow2)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000158481 |CD1C |
|ENSG00000114013 |CD86 |
$feature_plot
$violin_plot
- Naïve B cell: MS4A1, CD79A, P2RX5
Naive_B_cell_markers_bone_marrow2 <- c("MS4A1", "CD79A", "P2RX5")
plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Naive_B_cell_markers_bone_marrow2)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000105369 |CD79A |
|ENSG00000156738 |MS4A1 |
|ENSG00000083454 |P2RX5 |
$feature_plot
$violin_plot
Annotate the cell types for the BoneMarrow2 data
# Annotate the cell types for the BoneMarrow2 data
# manually rename the seurat_clusters to the cell types
BoneMarrow2_pbmc_UMAP_new_meta <- BoneMarrow2_pbmc_UMAP@meta.data |>
# change the seurat_clusters to cell types
dplyr::mutate(cell_type = case_when(
seurat_clusters == 0 ~ "Neutrophil",
seurat_clusters == 1 ~ "Regulatory T-cell",
seurat_clusters == 2 ~ "Natural Killer Cell",
seurat_clusters == 3 ~ "CD16 Natural Killer Cell",
seurat_clusters == 4 ~ "CD8 T Cell",
seurat_clusters == 5 ~ "Regulatory T-cell",
seurat_clusters == 6 ~ "Mucosal-associated Invariant T Cell",
seurat_clusters == 7 ~ "Plasmacytoid Dendritic Cell",
seurat_clusters == 8 ~ "Dendritic Cell",
seurat_clusters == 9 ~ "Naïve B Cell",
seurat_clusters == 10 ~ "Non-classical Monocyte Cell",
seurat_clusters == 11 ~ "Neutrophil Granulocytes",
seurat_clusters == 12 ~ "Natural Killer Cell")) |>
dplyr::mutate(cell_type = as.factor(cell_type))
# add the cell types to the UMAP object
BoneMarrow2_pbmc_UMAP <- AddMetaData(
BoneMarrow2_pbmc_UMAP,
metadata = BoneMarrow2_pbmc_UMAP_new_meta,
col.name = "cell_type")
# export the UMAP object to a rds file
saveRDS(BoneMarrow2_pbmc_UMAP, "BoneMarrow2_pbmc_UMAP.rds")2.7.2.1 Summary Plots
Plot the UMAP with the cell types for the BoneMarrow2 data
DimPlot(BoneMarrow2_pbmc_UMAP,
reduction = "umap",
label = TRUE,
group.by = "cell_type",
label.size = 3,
repel = TRUE,
alpha = 0.5) +
ggtitle("BoneMarrow2 Cell Types") +
NoLegend()Plot the heatmap
DoHeatmap(
BoneMarrow2_pbmc_UMAP,
features = BoneMarrow2_pbmc_markers_top_5$gene,
label = TRUE,
group.by = "cell_type",
angle = 23,
size = 4) |>
suppressWarnings() +
NoLegend() +
ggtitle("BoneMarrow2 Heatmap") +
rremove("y.text") 3 Part 2: Pancreas
3.1 Read the data
# Load the Pancreas dataset
Pancreas <- readRDS(
paste("/Users/jacenai/Desktop/STAT_M254/",
"Final_Project/Datasets_final/",
"Pancreas.rds",
sep = "")) |>
as.data.frame()3.2 Create Seurat and QC
# Create the Pancreas Seurat object
Pancreas <- Seurat::CreateSeuratObject(
counts = Pancreas,
project = "Pancreas",
assay = "DNA",
min.cells = 10,
min.features = 200) |>
suppressWarnings()
# Find mitochondrial genes for Pancreas
Pancreas[["percent.mt"]] <- PercentageFeatureSet(
Pancreas,
pattern = "^MT-") |>
suppressWarnings()
VlnPlot(Pancreas,
features = c("nFeature_DNA",
"nCount_DNA",
"percent.mt"),
ncol = 3) |>
suppressWarnings()3.3 Transformation & Feature selection & Scaling
I will try log1pCPM, and SCTransform to transform the data.
# Transform the pancreas data
# Seurat default log-normalization (log1pCP10k)
pancreas_log1pCP1k_1 <- NormalizeData(
Pancreas,
normalization.method = "LogNormalize",
scale.factor = 1000)
# identify the highly variable genes
pancreas_features_log1pCP1k_1 <- FindVariableFeatures(
pancreas_log1pCP1k_1,
selection.method = "vst",
nfeatures = 2000) # tune this parameter
# Scale the data
pancreas_scaled_log1pCP1k_1 <-
ScaleData(pancreas_features_log1pCP1k_1,
features =
VariableFeatures(pancreas_features_log1pCP1k_1))
# SCTransform the Pancreas data
pancreas_SCTransform <- SCTransform(
Pancreas,
verbose = FALSE,
variable.features.n = 3000, # tune this parameter
return.only.var.genes = TRUE,
assay = "DNA",
vars.to.regress = c("nCount_DNA", "percent.mt"))Calculate the variance of transformed data and plot it against the mean.
# Calculate the variance of the transformed data
pancreas_variance_log1pCP1k_1 <-
pancreas_scaled_log1pCP1k_1[["DNA"]]@layers$scale.data |>
apply(1, var)
pancreas_mean_log1pCP1k_1 <-
pancreas_scaled_log1pCP1k_1[["DNA"]]@layers$scale.data |>
apply(1, mean)
pancreas_variance_SCTransform <-
pancreas_SCTransform[["SCT"]]@scale.data |>
apply(1, var)
pancreas_mean_SCTransform <-
pancreas_SCTransform[["SCT"]]@scale.data |>
apply(1, mean)Plot the variance against the mean
# Plot the variance against the mean
# Create a data frame for plotting
gene_var_mean_df <-
data.frame(Gene_Variance =
c(pancreas_variance_log1pCP1k_1,
pancreas_variance_SCTransform),
Gene_Mean =
c(pancreas_mean_log1pCP1k_1,
pancreas_mean_SCTransform),
Transformation =
rep(c("log1pCP1k", "SCTransform"),
c(length(pancreas_variance_log1pCP1k_1),
length(pancreas_variance_SCTransform))))
# Plot the variance against the mean
ggplot(gene_var_mean_df,
aes(x = Gene_Mean,
y = Gene_Variance,
color = Transformation)) +
geom_point(alpha = 0.5) +
facet_wrap(~Transformation,
scales = "free") +
theme_gray() +
labs(title = "Variance vs Mean",
x = "Mean",
y = "Variance") +
theme(legend.position = "none") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5))I decided to use SCTransform for the downstream analysis.
3.4 Linear dimensional reduction
# I decided to use SCTransform for the downstream analysis
# Perform PCA on the Pancreas data
Pancreas_pbmc_PCA <- RunPCA(
pancreas_SCTransform,
features = VariableFeatures(pancreas_SCTransform),
npcs = 100,
verbose = FALSE)
ElbowPlot(Pancreas_pbmc_PCA, ndims = 100)3.5 Get the gene names
Ensembl IDs to gene
if (file.exists("Pancreas_data_IDs.rds")) {
Pancreas_data_IDs <- readRDS("Pancreas_data_IDs.rds")
} else {
Pancreas_data_IDs <- getBM(
attributes = c("ensembl_gene_id","hgnc_symbol"),
filters = 'ensembl_gene_id',
values = rownames(Pancreas),
mart = mrt)
saveRDS(Pancreas_data_IDs, "Pancreas_data_IDs.rds")
}3.6 Clustering
# Perform neighborhood-based clustering
Pancreas_pbmc_neighbor <- FindNeighbors(
Pancreas_pbmc_PCA,
reduction = "pca",
dims = 1:100,
k.param = 30,
prune.SNN = 1/15,
verbose = FALSE)
Pancreas_pbmc_cluster <- FindClusters(
Pancreas_pbmc_neighbor,
resolution = 0.5 ,
verbose = FALSE)Run UMAP and t-SNE
# Run UMAP
Pancreas_pbmc_UMAP <- RunUMAP(
Pancreas_pbmc_cluster,
dims = 1:100,
verbose = FALSE)
# try t_SNE
Pancreas_pbmc_tSNE <- RunTSNE(
Pancreas_pbmc_cluster,
dims = 1:90,
verbose = FALSE)Plot the UMAP and t-SNE
DimPlot(Pancreas_pbmc_UMAP,
reduction = "umap",
label = TRUE,
group.by = "seurat_clusters",
alpha = 0.5) +
ggtitle("Pancreas UMAP")DimPlot(Pancreas_pbmc_tSNE,
reduction = "tsne",
label = TRUE,
group.by = "seurat_clusters",
alpha = 0.5) +
ggtitle("Pancreas t-SNE")By comparing the UMAP and t-SNE plots, I decided to use the UMAP.
3.7 Annotation Results
# Find gene markers
if (file.exists("pancreas_gene_markers.rds")) {
pancreas_gene_markers <- readRDS("pancreas_gene_markers.rds")
} else {
pancreas_gene_markers <- FindAllMarkers(
Pancreas_pbmc_UMAP,
min.pct = 0.01,
logfc.threshold = 0.01,
test.use = "wilcox",
verbose = FALSE)
saveRDS(pancreas_gene_markers, "pancreas_gene_markers.rds")
}
# Filter the gene markers
Pancreas_pbmc_markers_top_5 <-
pancreas_gene_markers |>
as_tibble() |>
filter(avg_log2FC > 2) |>
filter(p_val_adj < 0.01) |>
group_by(cluster, p_val_adj) |>
# arrange the gene with the order of
# high in avg_log2FC and low in p_val_adj
arrange(cluster, p_val_adj, desc(avg_log2FC)) |>
mutate(gene_names = map_chr(
gene, ~ Pancreas_data_IDs$hgnc_symbol[
match(.x, Pancreas_data_IDs$ensembl_gene_id)])) |>
# select the top 5 genes in each cluster
group_by(cluster) |>
dplyr::slice(1:5) |>
dplyr::select(cluster, gene_names, everything())
Pancreas_pbmc_markers_top_5 |>
as_tibble() |>
knitr::kable()| cluster | gene_names | p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | gene |
|---|---|---|---|---|---|---|---|
| 0 | GATM | 0 | 2.314042 | 1.000 | 0.817 | 0 | ENSG00000171766 |
| 0 | SYCN | 0 | 2.056717 | 1.000 | 0.969 | 0 | ENSG00000179751 |
| 0 | AMY2A | 0 | 2.232347 | 0.998 | 0.994 | 0 | ENSG00000243480 |
| 0 | AQP8 | 0 | 2.280071 | 0.948 | 0.561 | 0 | ENSG00000103375 |
| 0 | AMY2B | 0 | 2.190193 | 0.993 | 0.883 | 0 | ENSG00000240038 |
| 1 | REG3G | 0 | 4.080445 | 1.000 | 0.844 | 0 | ENSG00000143954 |
| 1 | REG1B | 0 | 3.215856 | 1.000 | 1.000 | 0 | ENSG00000172023 |
| 1 | REG1A | 0 | 2.081271 | 1.000 | 1.000 | 0 | ENSG00000115386 |
| 1 | SERPINA3 | 0 | 3.717401 | 0.534 | 0.083 | 0 | ENSG00000196136 |
| 1 | C3 | 0 | 3.439010 | 0.685 | 0.193 | 0 | ENSG00000125730 |
| 2 | GYPC | 0 | 3.637994 | 0.741 | 0.105 | 0 | ENSG00000136732 |
| 2 | DAB2 | 0 | 4.486262 | 0.553 | 0.049 | 0 | ENSG00000153071 |
| 2 | S100A13 | 0 | 2.685883 | 0.962 | 0.452 | 0 | ENSG00000189171 |
| 2 | AQP1 | 0 | 4.018850 | 0.662 | 0.103 | 0 | ENSG00000240583 |
| 2 | CAVIN1 | 0 | 2.775235 | 0.700 | 0.100 | 0 | ENSG00000177469 |
| 3 | HSD11B1-AS1 | 0 | 3.592255 | 0.329 | 0.034 | 0 | ENSG00000227591 |
| 3 | LINC01610 | 0 | 2.458333 | 0.883 | 0.643 | 0 | ENSG00000237643 |
| 3 | TAC1 | 0 | 3.921941 | 0.428 | 0.109 | 0 | ENSG00000006128 |
| 3 | CPA4 | 0 | 2.247372 | 0.266 | 0.062 | 0 | ENSG00000128510 |
| 3 | ANXA10 | 0 | 2.430645 | 0.302 | 0.089 | 0 | ENSG00000109511 |
| 4 | RXRG | 0 | 6.361009 | 0.399 | 0.005 | 0 | ENSG00000143171 |
| 4 | ADCYAP1 | 0 | 5.879420 | 0.454 | 0.014 | 0 | ENSG00000141433 |
| 4 | PCSK1 | 0 | 4.752696 | 0.475 | 0.018 | 0 | ENSG00000175426 |
| 4 | TAGLN3 | 0 | 5.974577 | 0.464 | 0.017 | 0 | ENSG00000144834 |
| 4 | ASB9 | 0 | 6.459432 | 0.492 | 0.024 | 0 | ENSG00000102048 |
| 5 | MALL | 0 | 5.182514 | 0.982 | 0.118 | 0 | ENSG00000144063 |
| 5 | ALDH1A3 | 0 | 4.547304 | 0.945 | 0.129 | 0 | ENSG00000184254 |
| 5 | FLNA | 0 | 4.041860 | 0.982 | 0.158 | 0 | ENSG00000196924 |
| 5 | S100A14 | 0 | 3.988552 | 0.936 | 0.135 | 0 | ENSG00000189334 |
| 5 | TSPAN15 | 0 | 3.189382 | 0.955 | 0.157 | 0 | ENSG00000099282 |
| 6 | IRX2-DT | 0 | 6.099829 | 0.833 | 0.023 | 0 | ENSG00000186493 |
| 6 | F10 | 0 | 5.597897 | 0.833 | 0.031 | 0 | ENSG00000126218 |
| 6 | CRH | 0 | 6.225981 | 0.567 | 0.012 | 0 | ENSG00000147571 |
| 6 | SMIM24 | 0 | 5.272775 | 0.817 | 0.043 | 0 | ENSG00000095932 |
| 6 | PCSK2 | 0 | 5.318404 | 0.933 | 0.067 | 0 | ENSG00000125851 |
| 7 | MALAT1 | 0 | 4.024411 | 1.000 | 0.992 | 0 | ENSG00000251562 |
| 7 | NEAT1 | 0 | 4.437703 | 1.000 | 0.947 | 0 | ENSG00000245532 |
| 7 | LENG8 | 0 | 2.830771 | 1.000 | 0.752 | 0 | ENSG00000167615 |
| 7 | RBM25 | 0 | 2.400703 | 0.982 | 0.569 | 0 | ENSG00000119707 |
| 7 | MEG3 | 0 | 2.917470 | 0.912 | 0.474 | 0 | ENSG00000214548 |
# export the top 5 highly expressed genes each cluster to a xlsx file
if (!file.exists("Pancreas_pbmc_markers_top_5.xlsx")) {
write_xlsx(Pancreas_pbmc_markers_top_5,
"Pancreas_pbmc_markers_top_5.xlsx")
}By looking up the highly expressed genes in each cluster online and in the literature, I can roughly identify the cell types in the Pancreas data. I found the following cell types:
| Cluster | Cell Type |
|---|---|
| 0 | Acinar Cell |
| 1 | Acinar Cell |
| 2 | Ductal Cell |
| 3 | Acinar Cell |
| 4 | Beta Cell |
| 5 | Ductal Cell |
| 6 | Alpha Cell |
| 7 | Mixed Immune Cell |
Since I already have the primary cell types for the Pancreas data from the top 5 highly expressed genes in each cluster, we can verfiy the cell types by gene markers from the literature or from online databases (e.g., CellMarker 2.0, PanglaoDB)
And I found the following gene markers for the cell types in the Pancreas data:
| Cell Type | Gene Markers |
|---|---|
| Acinar Cell | PRSS1, CELA3A, CELA3B, CPA1, CTRB1, CTRB2, REG1A, REG1B, REG3A, REG3G, REG4 |
| Ductal Cell | KRT19, KRT7, KRT8, KRT18, KRT20, MUC1, MUC6, MUC5AC, MUC5B, SOX9 |
| Beta Cell | INS, GCG, IAPP, PCSK1, PCSK2, SLC2A2, SLC30A8, NKX6-1, PDX1, MAFA |
| Alpha Cell | GCG, ARX, IRX2, IRX3, IRX5, IRX6, IRX1, IRX4, IRX7, IRX8, IRX9 |
| Mixed Immune Cell | CD3D, CD3E, CD3G, CD4, CD8A, CD8B, CD19, CD20, CD22, CD79A, CD79B, CD34, CD38 |
Then plot feature plots for the classic gene markers for each cell type to verify the cell types in the Pancreas data.
- Acinar Cell: PRSS1, CELA3A, CELA3B, CPA1, CTRB1, CTRB2, REG1A, REG1B, REG3A, REG3G, REG4
Acinar_cell_markers <- c("PRSS1", "CELA3A", "CELA3B", "CPA1")
plot_feature_plots(Pancreas_pbmc_UMAP,
Acinar_cell_markers)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000142789 |CELA3A |
|ENSG00000219073 |CELA3B |
|ENSG00000091704 |CPA1 |
|ENSG00000274247 |PRSS1 |
|ENSG00000204983 |PRSS1 |
$feature_plot
$violin_plot
- Ductal Cell: KRT19, KRT7, KRT8, KRT18, KRT20, MUC1, MUC6, MUC5AC, MUC5B, SOX9
Ductal_cell_markers <- c("KRT19", "KRT7", "KRT8", "KRT18", "MUC5AC")
plot_feature_plots(Pancreas_pbmc_UMAP,
Ductal_cell_markers)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000111057 |KRT18 |
|ENSG00000171345 |KRT19 |
|ENSG00000135480 |KRT7 |
|ENSG00000170421 |KRT8 |
|ENSG00000215182 |MUC5AC |
|ENSG00000291363 |MUC5AC |
$feature_plot
$violin_plot
- Beta Cell: INS, GCG, IAPP, PCSK1, PCSK2, SLC2A2, SLC30A8, NKX6-1, PDX1, MAFA
Beta_cell_markers <- c("INS", "GCG", "IAPP", "PCSK1", "PCSK2")
plot_feature_plots(Pancreas_pbmc_UMAP,
Beta_cell_markers)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000115263 |GCG |
|ENSG00000121351 |IAPP |
|ENSG00000254647 |INS |
|ENSG00000175426 |PCSK1 |
|ENSG00000125851 |PCSK2 |
$feature_plot
$violin_plot
- Alpha Cell: GCG, ARX, IRX2, IRX3, IRX5, IRX6, IRX1, IRX4, IRX7, IRX8, IRX9
Alpha_cell_markers <- c("GCG", "ARX", "IRX2", "IRX3")
plot_feature_plots(Pancreas_pbmc_UMAP,
Alpha_cell_markers)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000004848 |ARX |
|ENSG00000115263 |GCG |
|ENSG00000170561 |IRX2 |
|ENSG00000177508 |IRX3 |
$feature_plot
$violin_plot
- Mixed Immune Cell: MALAT1, NEAT1
Mixed_immune_cell_markers <- c("MALAT1", "NEAT1")
plot_feature_plots(Pancreas_pbmc_UMAP,
Mixed_immune_cell_markers)$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000251562 |MALAT1 |
|ENSG00000245532 |NEAT1 |
$feature_plot
$violin_plot
Annotate the cell types
Annotate the cell types for the Pancreas data based on the gene markers and the feature plots.
# Annotate the cell types for the Pancreas data
# manually rename the seurat_clusters to the cell types
Pancreas_pbmc_UMAP_new_meta <- Pancreas_pbmc_UMAP@meta.data |>
# change the seurat_clusters to cell types
dplyr::mutate(cell_type = case_when(
seurat_clusters == 0 ~ "Acinar Cell",
seurat_clusters == 1 ~ "Acinar Cell",
seurat_clusters == 2 ~ "Ductal Cell",
seurat_clusters == 3 ~ "Acinar Cell",
seurat_clusters == 4 ~ "Beta Cell",
seurat_clusters == 5 ~ "Ductal Cell",
seurat_clusters == 6 ~ "Alpha Cell",
seurat_clusters == 7 ~ "Mixed Immune Cell")) |>
dplyr::mutate(cell_type = as.factor(cell_type))
# add the cell types to the UMAP object
Pancreas_pbmc_UMAP <- AddMetaData(
Pancreas_pbmc_UMAP,
metadata = Pancreas_pbmc_UMAP_new_meta,
col.name = "cell_type")
# export the UMAP object to a rds file
saveRDS(Pancreas_pbmc_UMAP, "Pancreas_pbmc_UMAP.rds")3.7.0.1 Summary Plots
Plot the UMAP with the cell types for the Pancreas data.
DimPlot(Pancreas_pbmc_UMAP,
reduction = "umap",
label = TRUE,
group.by = "cell_type",
label.size = 4,
repel = TRUE,
alpha = 0.5) +
ggtitle("Pancreas Cell Types") +
NoLegend()Plot the heatmap
# plot the heatmap
DoHeatmap(
Pancreas_pbmc_UMAP,
features = Pancreas_pbmc_markers_top_5$gene,
label = TRUE,
group.by = "cell_type",
size = 4) |>
suppressWarnings() +
NoLegend() +
rremove("y.text") New marker for alpha cells
FeaturePlot(
Pancreas_pbmc_UMAP,
features = c("ENSG00000126218"),
reduction = "umap",
slot = "data")4 Conclusion
In this final project, I have analyzed the BoneMarrow1, BoneMarrow2 and Pancreas data using the Seurat package in R. I have identified 14 cell types in the BoneMarrow1 data, 11 cell types in the BoneMarrow2 data, and 5 cell types in the Pancreas data. I have also identified the gene markers for each cell type and annotated the cell types based on the gene markers and the feature plots. Finally, I have plotted the heatmap for the top 5 gene markers for each cell type in the Pancreas data and identified a new marker for alpha cells. The results of this analysis can be used to better understand the cell types in the BoneMarrow1, BoneMarrow2 and Pancreas data and to further investigate the role of these cell types in health and disease.